II)Parte 1
Exercício 1
library(readr)
library(janitor)
library(ggplot2)
# Absolute location of the .rda file read below.
caminho <- "C:/Users/monal/Downloads/Curso Ciencias de Dados/us_change.rda"
# load() restores the saved objects into the workspace; `dados` only keeps
# the character vector with the names of the restored objects.
dados <- load(file = caminho)
# Standardise the column names of `us_change` and keep it as a tibble.
us_change <- tibble::as_tibble(janitor::clean_names(us_change))
# Preview the first rows of the data set
head(us_change)
# Inspect the structure of the data set (column types and attributes)
str(us_change)
tibble [198 x 6] (S3: tbl_df/tbl/data.frame)
$ quarter : yearquarter[1:198], format: "1970-01-01" "1970-04-01" ...
$ consumption : num [1:198] 0.619 0.452 0.873 -0.272 1.901 ...
$ income : num [1:198] 1.04 1.23 1.59 -0.24 1.98 ...
$ production : num [1:198] -2.452 -0.551 -0.359 -2.186 1.91 ...
$ savings : num [1:198] 5.3 7.79 7.4 1.17 3.54 ...
$ unemployment: num [1:198] 0.9 0.5 0.5 0.7 -0.1 ...
- attr(*, "key")= tibble [1 x 1] (S3: tbl_df/tbl/data.frame)
..$ .rows:List of 1
.. ..$ : int [1:198] 1 2 3 4 5 6 7 8 9 10 ...
- attr(*, "index")= chr "Quarter"
..- attr(*, "ordered")= logi TRUE
- attr(*, "index2")= chr "Quarter"
- attr(*, "interval")=List of 12
..$ year : num 0
..$ quarter : num 1
..$ month : num 0
..$ week : num 0
..$ day : num 0
..$ hour : num 0
..$ minute : num 0
..$ second : num 0
..$ millisecond: num 0
..$ microsecond: num 0
..$ nanosecond : num 0
..$ unit : num 0
..- attr(*, "class")= chr "interval"
# Split the unemployment series in two around 2000-01-01:
# data_filter_1 keeps the quarters from 2000 onwards, data_filter_2 the
# quarters before 2000. Only `quarter` and `unemployment` are retained.
data_filter_1 <- dplyr::filter(
  dplyr::select(us_change, "quarter", "unemployment"),
  quarter >= "2000-01-01"
)
data_filter_2 <- dplyr::filter(
  dplyr::select(us_change, "quarter", "unemployment"),
  quarter < "2000-01-01"
)
# Turn the percentage changes into index levels by compounding them.
#
# The original element-by-element loops used `2:length(x)`, which counts
# backwards (2, 1) when a segment has fewer than two rows. cumprod() performs
# the same chained multiplication in one vectorised step and is safe there.

# First segment (2000 onwards): the first observation is fixed at 100 and
# each later value is (1 + change / 100) times the previous level.
data.nivel1 <- data_filter_1
data.nivel1$unemployment <- cumprod(
  c(100, 1 + data_filter_1$unemployment[-1] / 100)
)

# Starting level reused as the base of the second segment
start <- data.nivel1$unemployment[1]

# Second segment (before 2000): the first level is (1 + change / 100) * start
# and the following ones keep compounding; the seed itself is dropped.
# NOTE(review): this chains the pre-2000 quarters forward from `start`, so the
# joined series restarts at 100 in 2000 instead of being back-cast from it --
# confirm that this is the intended index definition.
data.nivel2 <- data_filter_2
data.nivel2$unemployment <- cumprod(
  c(start, 1 + data_filter_2$unemployment / 100)
)[-1]
# Stack the two level series: pre-2000 rows first, then 2000 onwards.
# The two halves cover disjoint quarters, so the former key-less full join
# (which matched on every column, including the double `unemployment`) could
# only ever append rows; bind_rows() states that intent directly and avoids
# joining on a floating-point column.
data <- data.nivel2 %>%
  dplyr::bind_rows(data.nivel1) %>%
  tibble::as_tibble()
Joining, by = c("quarter", "unemployment")
Letra B
# Correlation matrix of every column except `quarter`, rounded to 2 decimals.
# select() is namespaced because dplyr is not attached at this point of the
# script (only readr, janitor and ggplot2 are), matching the other calls.
correl <- us_change %>%
  dplyr::select(-"quarter") %>%
  cor() %>%
  round(2)
print(correl)
consumption income production savings unemployment
consumption 1.00 0.38 0.53 -0.26 -0.53
income 0.38 1.00 0.27 0.72 -0.22
production 0.53 0.27 1.00 -0.06 -0.77
savings -0.26 0.72 -0.06 1.00 0.11
unemployment -0.53 -0.22 -0.77 0.11 1.00
# Draw the upper triangle of the correlation matrix with black text labels.
# The stray trailing comma (which passed an empty extra argument) is removed.
corrplot::corrplot(
  correl,
  type = "upper",
  tl.col = "black"
)
# `us_change_nivel` is not created anywhere in the visible script. When it is
# missing, build it from `us_change` with the `unemployment` changes replaced
# by the index levels stored in `data` (matched by quarter).
# NOTE(review): presumably this is the intended definition -- confirm.
if (!exists("us_change_nivel")) {
  us_change_nivel <- us_change %>%
    dplyr::select(-"unemployment") %>%
    dplyr::left_join(data, by = "quarter")
}
# Correlation matrix of the level data set, rounded to 2 decimals.
correl1 <- us_change_nivel %>%
  dplyr::select(-"quarter") %>%
  cor() %>%
  round(2)
print(correl1)
consumption income production savings unemployment
consumption 1.00 0.38 0.53 -0.26 -0.03
income 0.38 1.00 0.27 0.72 -0.07
production 0.53 0.27 1.00 -0.06 -0.07
savings -0.26 0.72 -0.06 1.00 -0.01
unemployment -0.03 -0.07 -0.07 -0.01 1.00
# Draw the upper triangle of the second correlation matrix with black labels.
# The stray trailing comma (which passed an empty extra argument) is removed.
corrplot::corrplot(
  correl1,
  type = "upper",
  tl.col = "black"
)
Sim, apresentam diferenças porque no primeiro temos o valor “unemployment” em taxa de variação e os outros dados em números, “atrapalhando” a correlação entre os valores e alterando a análise. No segundo, temos todas as colunas na mesma unidade, logo a análise fica confiável.
Letra C
# Line chart of the unemployment index per quarter, shown interactively.
grafico_data <- ggplot(
  data = data,
  mapping = aes(x = quarter, y = unemployment)
) +
  geom_line() +
  labs(
    title = "Evolução do desemprego",
    x = "Anos",
    y = "Quantidade",
    colour = "quarter",
    caption = "Gapminder"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
plotly::ggplotly(grafico_data)
NA
# Chart 1: point(s) where the unemployment index reaches its maximum.
grafico_data1 <- ggplot(
  data = dplyr::filter(data, unemployment == max(unemployment)),
  mapping = aes(x = quarter, y = unemployment)
) +
  geom_point() +
  labs(
    title = "Maior taxa de desemprego",
    x = "Anos",
    y = "Quantidade",
    colour = "quarter",
    caption = "Gapminder"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
plotly::ggplotly(grafico_data1)
NA
NA
# Chart 2: histogram of the unemployment index values.
grafico_data2 <- ggplot(data = data, mapping = aes(x = unemployment)) +
  geom_histogram() +
  labs(
    title = "Evolução do Numero de Indice de desemprego",
    x = "Número Indice",
    y = "Quantidade",
    colour = "quarter",
    caption = "Gapminder"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
plotly::ggplotly(grafico_data2)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Chart 3: line chart of the original `unemployment` column of `us_change`.
grafico_data3 <- ggplot(
  data = us_change,
  mapping = aes(x = quarter, y = unemployment)
) +
  geom_line() +
  labs(
    title = "Evolução da Taxa de desemprego",
    x = "Anos",
    y = "Quantidade",
    colour = "quarter",
    caption = "Gapminder"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
plotly::ggplotly(grafico_data3)
NA
Letra D
Observando os gráficos, é possível visualizar a alta variação existente ao longo dos anos. Observa-se que houve uma taxa máxima de desemprego no gráfico 4, em outubro. Existe uma grande taxa de variação, como demonstra o gráfico de variação, e também se observa grande variação no número índice. No histograma tem-se a quantidade de desemprego por número índice ao longo dos anos.
Letra E
Sim, há várias mudanças bruscas ao longo do tempo. Temos o período de 1980 até 2000, em que foi registrado o maior índice de desemprego, e próximo do ano 2000 tem-se o menor índice.
Exercício 2
Letra A - O cuidado adicional inserido na importação do arquivo Retail.xlsx foi o argumento (skip = 1) para pular uma linha, porque a planilha do Excel possui duas linhas de cabeçalho.
# Read the retail workbook skipping its first row, standardise the column
# names and keep the result as a tibble.
dados_2 <- tibble::as_tibble(
  janitor::clean_names(
    readxl::read_excel(
      path = "C:/Users/monal/Downloads/Curso Ciencias de Dados/retail.xlsx",
      skip = 1
    )
  )
)
Letra B - Para converter o tipo POSIXct da coluna series_id em data, utilizou-se a biblioteca “lubridate” com a função as_date().
# Convert the `series_id` column to Date.
dados_2 <- dplyr::mutate(
  dados_2,
  series_id = lubridate::as_date(series_id)
)
Letras C e D - Essas questões foram feitas juntas, de forma a observar a sazonalidade e a tendência, bem como os outliers ou padrões anômalos. Decompondo a série pelo método STL (função stl()), observaram-se datas específicas de picos de vendas no data set importado.
# Build a monthly ts object from the `a3349335t` column. The start and end
# periods are the (year, month) of the first and last `series_id` values.
dados2.ts <- local({
  datas <- dados_2$series_id
  inicio <- c(
    lubridate::year(dplyr::first(datas)),
    lubridate::month(dplyr::first(datas))
  )
  fim <- c(
    lubridate::year(dplyr::last(datas)),
    lubridate::month(dplyr::last(datas))
  )
  serie <- ts(
    dplyr::select(dados_2, c("a3349335t")),
    start = inicio,
    end = fim,
    frequency = 12
  )
  # Keep only the first (and only) column of the series
  serie[, 1]
})
# Exploratory plots of the monthly series.
# The ts method of autoplot(), ggseasonplot() and ggmonthplot() come from the
# forecast package, which the script never attaches; the calls are namespaced
# so they resolve without relying on the state of the session.

# Plain time-series plot
grafico_1 <- forecast::autoplot(dados2.ts) +
  ggtitle("A3349335T") +
  theme(plot.title = element_text(hjust = 0.5)) +
  xlab("Anos") +
  ylab("Total de vendas por Dolar$'000s")

# Seasonal plot: one line per year, labelled on both sides
grafico_2 <- forecast::ggseasonplot(
  dados2.ts,
  year.labels = TRUE,
  year.labels.left = TRUE
) +
  theme(plot.title = element_text(hjust = 0.5)) +
  ylab("Total de vendas em dolar$'000s por Ano") +
  ggtitle("Evolução por estação : A3349335T")

# Sub-series plot: one panel per month
grafico_3 <- forecast::ggmonthplot(dados2.ts) +
  theme(plot.title = element_text(hjust = 0.5)) +
  ylab("Total de vendas em dolar$'000s") +
  ggtitle("Evolução por Mês: A3349335T")

# Window of the series from 1982 onwards, used by the lag plot
dados2ts2 <- window(dados2.ts, start = 1982)
'start' value not changed
# Lag plot of the windowed series. gglagplot() belongs to the forecast
# package, which is never attached in this script, so it is namespaced.
grafico_4 <- forecast::gglagplot(dados2ts2) +
  ggtitle("Venda mensal") +
  theme(plot.title = element_text(hjust = 0.5))
# STL decomposition of the monthly series reshaped to long format:
# one row per (date, component) with the component value in `valores`.
# pluck(), as_tibble(), mutate() and pivot_longer() are namespaced because
# tidyverse is only attached later in the script.
decomposicao <- stl(dados2.ts, s.window = "periodic", robust = TRUE) %>%
  purrr::pluck("time.series") %>%
  tibble::as_tibble() %>%
  dplyr::mutate(
    # Monthly dates spanning the first to the last `series_id` value
    date = seq(
      from = lubridate::as_date(dplyr::first(dados_2$series_id)),
      to = lubridate::as_date(dplyr::last(dados_2$series_id)),
      by = "months"
    )
  ) %>%
  tidyr::pivot_longer(
    cols = -"date",
    names_to = "decomposition",
    values_to = "valores"
  )
# Line charts of the seasonal and trend components of the decomposition.
# filter() is namespaced: without dplyr attached, a bare filter() resolves to
# stats::filter(), which is a different function.

# Seasonal component
grafico_5 <- decomposicao %>%
  dplyr::filter(decomposition == "seasonal") %>%
  ggplot(aes(x = date, y = valores)) +
  theme(plot.title = element_text(hjust = 0.5)) +
  geom_line() +
  labs(
    title = "Sazonalidade",
    y = "Quantidade",
    x = "Anos",
    caption = "Gapminder"
  )

# Trend component
grafico_6 <- decomposicao %>%
  dplyr::filter(decomposition == "trend") %>%
  ggplot(aes(x = date, y = valores)) +
  theme(plot.title = element_text(hjust = 0.5)) +
  geom_line() +
  labs(
    title = "Tendência",
    y = "Quantidade",
    x = "Anos",
    caption = "Gapminder"
  )
# Dates flagged as anomalies: decompose the `a3349335t` series, run the
# anomaly detection on the remainder, recompose, and keep only the
# `series_id` of the rows marked "Yes".
outlier <- local({
  decomposta <- anomalize::time_decompose(dados_2, "a3349335t")
  marcada <- anomalize::anomalize(decomposta, remainder)
  recomposta <- anomalize::time_recompose(marcada)
  dplyr::select(dplyr::filter(recomposta, anomaly == "Yes"), series_id)
})
Converting from tbl_df to tbl_time.
Auto-index message: index = series_id
frequency = 12 months
trend = 60 months
# Data analysis: interactive version of the time-series plot
plotly::ggplotly(grafico_1)
# Interactive seasonal plot (ggseasonplot)
plotly::ggplotly(grafico_2)
# Interactive monthly sub-series plot (ggmonthplot)
plotly::ggplotly(grafico_3)
# Data exploration: interactive lag plot
plotly::ggplotly(grafico_4)
Aspect ratios aren't yet implemented, but you can manually set a suitable height/widthAspect ratios aren't yet implemented, but you can manually set a suitable height/widthAspect ratios aren't yet implemented, but you can manually set a suitable height/widthAspect ratios aren't yet implemented, but you can manually set a suitable height/widthAspect ratios aren't yet implemented, but you can manually set a suitable height/widthAspect ratios aren't yet implemented, but you can manually set a suitable height/widthAspect ratios aren't yet implemented, but you can manually set a suitable height/widthAspect ratios aren't yet implemented, but you can manually set a suitable height/widthAspect ratios aren't yet implemented, but you can manually set a suitable height/widthAspect ratios aren't yet implemented, but you can manually set a suitable height/widthAspect ratios aren't yet implemented, but you can manually set a suitable height/widthAspect ratios aren't yet implemented, but you can manually set a suitable height/widthAspect ratios aren't yet implemented, but you can manually set a suitable height/widthAspect ratios aren't yet implemented, but you can manually set a suitable height/widthAspect ratios aren't yet implemented, but you can manually set a suitable height/widthAspect ratios aren't yet implemented, but you can manually set a suitable height/widthAspect ratios aren't yet implemented, but you can manually set a suitable height/widthAspect ratios aren't yet implemented, but you can manually set a suitable height/widthAspect ratios aren't yet implemented, but you can manually set a suitable height/widthAspect ratios aren't yet implemented, but you can manually set a suitable height/widthAspect ratios aren't yet implemented, but you can manually set a suitable height/widthAspect ratios aren't yet implemented, but you can manually set a suitable height/widthAspect ratios aren't yet implemented, but you can manually set a suitable height/widthAspect ratios aren't 
yet implemented, but you can manually set a suitable height/widthAspect ratios aren't yet implemented, but you can manually set a suitable height/widthAspect ratios aren't yet implemented, but you can manually set a suitable height/widthAspect ratios aren't yet implemented, but you can manually set a suitable height/widthAspect ratios aren't yet implemented, but you can manually set a suitable height/widthAspect ratios aren't yet implemented, but you can manually set a suitable height/widthAspect ratios aren't yet implemented, but you can manually set a suitable height/widthAspect ratios aren't yet implemented, but you can manually set a suitable height/widthAspect ratios aren't yet implemented, but you can manually set a suitable height/width
# Seasonal component, interactive
plotly::ggplotly(grafico_5)
# Trend component, interactive
plotly::ggplotly(grafico_6)
# Print the dates flagged as anomalies
outlier
# A time tibble: 8 x 1
[90m# Index: series_id[39m
series_id
[3m[90m<date>[39m[23m
[90m1[39m 2007-12-01
[90m2[39m 2008-12-01
[90m3[39m 2009-11-01
[90m4[39m 2009-12-01
[90m5[39m 2010-12-01
[90m6[39m 2011-12-01
[90m7[39m 2012-12-01
[90m8[39m 2013-12-01
Parte - 2
Exercício 1 - Spotify
library(gapminder)
library(tidyverse)
library(janitor)
# Read the Spotify songs CSV from the TidyTuesday repository.
songs <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-01-21/spotify_songs.csv"
)
Parsed with column specification:
cols(
.default = col_double(),
track_id = [31mcol_character()[39m,
track_name = [31mcol_character()[39m,
track_artist = [31mcol_character()[39m,
track_album_id = [31mcol_character()[39m,
track_album_name = [31mcol_character()[39m,
track_album_release_date = [31mcol_character()[39m,
playlist_name = [31mcol_character()[39m,
playlist_id = [31mcol_character()[39m,
playlist_genre = [31mcol_character()[39m,
playlist_subgenre = [31mcol_character()[39m
)
See spec(...) for full column specifications.
# Standardise the column names and keep `songs` as a tibble.
songs <- tibble::as_tibble(janitor::clean_names(songs))
# Data analysis: columns used by the artist-level charts.
data_songs_artist <- dplyr::select(
  songs,
  "energy", "track_popularity", "track_artist", "track_album_release_date"
)
# Scatter plot of popularity by release date for tracks with popularity
# above 90 whose release date compares >= "2019-06-01".
grafic_1 <- ggplot(
  data = dplyr::filter(
    data_songs_artist,
    track_popularity > 90,
    track_album_release_date >= "2019-06-01"
  ),
  mapping = aes(x = track_album_release_date, y = track_popularity)
) +
  geom_point() +
  labs(
    title = "Popularidade (maior 90) entre os artistas ",
    subtitle = "Por data",
    x = "Data",
    y = "Popularidade",
    colour = "track_artist",
    caption = "Gapminder"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
# Line chart of popularity against energy for tracks with popularity above
# 90 and energy above 0.7.
grafic_2 <- ggplot(
  data = dplyr::filter(
    data_songs_artist,
    track_popularity > 90,
    energy > 0.7
  ),
  mapping = aes(x = energy, y = track_popularity)
) +
  geom_line() +
  labs(
    title = "Popularidade (maior 90) entre os artistas ",
    subtitle = "Por energia (maior 0.7)",
    x = "Energy",
    y = "Popularidade",
    colour = "track_artist",
    caption = "Gapminder"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
# Scatter plot of the track(s) whose energy equals the maximum energy.
grafic_3 <- ggplot(
  data = dplyr::filter(data_songs_artist, energy == max(energy)),
  mapping = aes(x = track_popularity, y = energy)
) +
  geom_point() +
  labs(
    title = "Maior Energy ",
    subtitle = "Por energia",
    x = "Popularidade",
    y = "Energy",
    colour = "track_artist",
    caption = "Gapminder"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
# Histogram of the number of tracks per energy value.
grafic_4 <- ggplot(data = data_songs_artist) +
  geom_histogram(mapping = aes(x = energy)) +
  labs(
    title = "Quantidade de musica",
    subtitle = "Por energia",
    x = "Energy",
    y = "Count music",
    caption = "Gapminder"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
# Convert the album release date column to Date.
songs <- dplyr::mutate(
  songs,
  track_album_release_date = lubridate::as_date(track_album_release_date)
)
1886 failed to parse.
# Columns used by the release-date chart.
data_songs <- dplyr::select(
  songs,
  "track_popularity", "track_album_release_date", "track_album_name"
)
# Line chart of popularity by release date for tracks with popularity > 80.
grafic_5 <- ggplot(
  data = dplyr::filter(data_songs, track_popularity > 80),
  mapping = aes(x = track_album_release_date, y = track_popularity)
) +
  geom_line() +
  labs(
    title = "Popularidade entre os artistas ",
    subtitle = "Por data de lançamento",
    x = "Data de Lançamento",
    y = "Popularidade",
    colour = "track_album_name",
    caption = "Gapminder"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
# Analysis 1 - Spotify: popularity by release date
plotly::ggplotly(grafic_1)
# Analysis 2 - Spotify: popularity by energy
plotly::ggplotly(grafic_2)
# Analysis 3 - Spotify: track(s) with the maximum energy
plotly::ggplotly(grafic_3)
# Analysis 4 - Spotify: histogram of energy
plotly::ggplotly(grafic_4)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Analysis 5 - Spotify: popularity by release date (popularity > 80)
plotly::ggplotly(grafic_5)
Exercício 2 - Video Games
library(tidyverse)
library(lubridate)
# Import the video games CSV from the TidyTuesday repository.
video_games <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-07-30/video_games.csv"
)
Parsed with column specification:
cols(
number = [32mcol_double()[39m,
game = [31mcol_character()[39m,
release_date = [31mcol_character()[39m,
price = [32mcol_double()[39m,
owners = [31mcol_character()[39m,
developer = [31mcol_character()[39m,
publisher = [31mcol_character()[39m,
average_playtime = [32mcol_double()[39m,
median_playtime = [32mcol_double()[39m,
metascore = [32mcol_double()[39m
)
# Preview the raw data
head(video_games)
# Standardise the column names and parse `release_date` as a
# month-day-year date.
video_games <- dplyr::mutate(
  janitor::clean_names(video_games),
  release_date = mdy(release_date)
)
1 failed to parse.
# Release date and price of the games priced below 0.99.
data_filter_price <- dplyr::filter(
  dplyr::select(video_games, "release_date", "price"),
  price < 0.99
)
# Line chart of price by release date for the games priced below 0.99.
# The aesthetics were swapped (x = price, y = release_date) while the axis
# labels say x = "Ano" and y = "Preço"; the mapping now matches the labels
# and the sibling chart graf_2.
graf_1 <- data_filter_price %>%
  ggplot() +
  geom_line(aes(x = release_date, y = price)) +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(
    title = "Comparação dos preços dos jogos ",
    subtitle = "Por data",
    y = "Preço (< 0,99)",
    x = "Ano",
    colour = "price",
    caption = "Gapminder"
  )
# Scatter plot of the game(s) with the highest price among those below 0.99.
graf_2 <- ggplot(
  data = dplyr::filter(data_filter_price, price == max(price))
) +
  geom_point(mapping = aes(x = release_date, y = price)) +
  labs(
    title = "Evolução dos preços dos jogos",
    subtitle = "Por data",
    x = "Ano",
    y = "Preço (em dolares)",
    caption = "Gapminder"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
# Histogram of the number of games per price (prices below 0.99 only).
graf_3 <- ggplot(data = data_filter_price) +
  geom_histogram(mapping = aes(x = price)) +
  labs(
    title = "Quantidade de jogos por preço",
    subtitle = "Por preço (Price < 0.99)",
    x = "Preço (em dolares)",
    y = "Count games",
    caption = "Gapminder"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
# Game name and metascore, with the rows containing missing values removed.
data_filter_game <- tidyr::drop_na(
  tibble::as_tibble(
    janitor::clean_names(
      dplyr::select(video_games, "game", "metascore")
    )
  )
)
# Histogram of the number of games per metascore.
graf_4 <- ggplot(data = data_filter_game) +
  geom_histogram(mapping = aes(x = metascore)) +
  labs(
    title = "Metascore",
    subtitle = "Por game",
    x = "Metascore",
    y = "Count games",
    caption = "Gapminder"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
# Analysis 1 - Video Games: price line chart
plotly::ggplotly(graf_1)
# Analysis 2 - Video Games: game(s) with the highest filtered price
plotly::ggplotly(graf_2)
# Analysis 3 - Video Games: histogram of prices
plotly::ggplotly(graf_3)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Analysis 4 - Video Games: histogram of metascores
plotly::ggplotly(graf_4)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.